{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "d7aa1b0ad6979877450f9cd89e1e37289b51cf6e"
   },
   "source": [
    "# 推荐系统"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "_uuid": "c1fdd129c1cbab68ae3e6bf2062575f01f80b87c"
   },
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: './movie/tmdb/tmdb_5000_credits.csv'",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mFileNotFoundError\u001B[0m                         Traceback (most recent call last)",
      "\u001B[1;32m<ipython-input-1-1f9d88939a23>\u001B[0m in \u001B[0;36m<module>\u001B[1;34m\u001B[0m\n\u001B[0;32m      6\u001B[0m \u001B[1;32mimport\u001B[0m \u001B[0mpandas\u001B[0m \u001B[1;32mas\u001B[0m \u001B[0mpd\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m      7\u001B[0m \u001B[1;32mimport\u001B[0m \u001B[0mnumpy\u001B[0m \u001B[1;32mas\u001B[0m \u001B[0mnp\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 8\u001B[1;33m \u001B[0mdf1\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mpd\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mread_csv\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m'./movie/tmdb/tmdb_5000_credits.csv'\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m      9\u001B[0m \u001B[0mdf2\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mpd\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mread_csv\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m'./movie/tmdb/tmdb_5000_movies.csv'\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m     10\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mc:\\users\\ginzeng\\.conda\\envs\\learn\\lib\\site-packages\\pandas\\io\\parsers.py\u001B[0m in \u001B[0;36mread_csv\u001B[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001B[0m\n\u001B[0;32m    686\u001B[0m     )\n\u001B[0;32m    687\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 688\u001B[1;33m     \u001B[1;32mreturn\u001B[0m \u001B[0m_read\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mfilepath_or_buffer\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mkwds\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m    689\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    690\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mc:\\users\\ginzeng\\.conda\\envs\\learn\\lib\\site-packages\\pandas\\io\\parsers.py\u001B[0m in \u001B[0;36m_read\u001B[1;34m(filepath_or_buffer, kwds)\u001B[0m\n\u001B[0;32m    452\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    453\u001B[0m     \u001B[1;31m# Create the parser.\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 454\u001B[1;33m     \u001B[0mparser\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mTextFileReader\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mfp_or_buf\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mkwds\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m    455\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    456\u001B[0m     \u001B[1;32mif\u001B[0m \u001B[0mchunksize\u001B[0m \u001B[1;32mor\u001B[0m \u001B[0miterator\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mc:\\users\\ginzeng\\.conda\\envs\\learn\\lib\\site-packages\\pandas\\io\\parsers.py\u001B[0m in \u001B[0;36m__init__\u001B[1;34m(self, f, engine, **kwds)\u001B[0m\n\u001B[0;32m    946\u001B[0m             \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0moptions\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;34m\"has_index_names\"\u001B[0m\u001B[1;33m]\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mkwds\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;34m\"has_index_names\"\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    947\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 948\u001B[1;33m         \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_make_engine\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mengine\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m    949\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    950\u001B[0m     \u001B[1;32mdef\u001B[0m \u001B[0mclose\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mc:\\users\\ginzeng\\.conda\\envs\\learn\\lib\\site-packages\\pandas\\io\\parsers.py\u001B[0m in \u001B[0;36m_make_engine\u001B[1;34m(self, engine)\u001B[0m\n\u001B[0;32m   1178\u001B[0m     \u001B[1;32mdef\u001B[0m \u001B[0m_make_engine\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mengine\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m\"c\"\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1179\u001B[0m         \u001B[1;32mif\u001B[0m \u001B[0mengine\u001B[0m \u001B[1;33m==\u001B[0m \u001B[1;34m\"c\"\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 1180\u001B[1;33m             \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_engine\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mCParserWrapper\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mf\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0moptions\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m   1181\u001B[0m         \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1182\u001B[0m             \u001B[1;32mif\u001B[0m \u001B[0mengine\u001B[0m \u001B[1;33m==\u001B[0m \u001B[1;34m\"python\"\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mc:\\users\\ginzeng\\.conda\\envs\\learn\\lib\\site-packages\\pandas\\io\\parsers.py\u001B[0m in \u001B[0;36m__init__\u001B[1;34m(self, src, **kwds)\u001B[0m\n\u001B[0;32m   2008\u001B[0m         \u001B[0mkwds\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;34m\"usecols\"\u001B[0m\u001B[1;33m]\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0musecols\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   2009\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 2010\u001B[1;33m         \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_reader\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mparsers\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mTextReader\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0msrc\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mkwds\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m   2011\u001B[0m         \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0munnamed_cols\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_reader\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0munnamed_cols\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   2012\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mpandas\\_libs\\parsers.pyx\u001B[0m in \u001B[0;36mpandas._libs.parsers.TextReader.__cinit__\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;32mpandas\\_libs\\parsers.pyx\u001B[0m in \u001B[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;31mFileNotFoundError\u001B[0m: [Errno 2] No such file or directory: './movie/tmdb/tmdb_5000_credits.csv'"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "数据下载地址: \n",
    "https://www.kaggle.com/tmdb/tmdb-movie-metadata\n",
    "https://www.kaggle.com/rounakbanik/the-movies-dataset\n",
    "\"\"\"\n",
    "import pandas as pd \n",
    "import numpy as np \n",
    "df1=pd.read_csv('./movie/tmdb/tmdb_5000_credits.csv')\n",
    "df2=pd.read_csv('./movie/tmdb/tmdb_5000_movies.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "402a28d17c13bba3f2060d72c2ff75f5377a9f01"
   },
   "source": [
    "**tmdb_5000_movies.csv 中共有 20 个字段，其各自释义如下：**\n",
    "\n",
    "* budget：预算\n",
    "* genres：分类\n",
    "* homepage：主页（大量缺失值，但不重要）\n",
    "* id：编号\n",
    "* keywords：关键词标签\n",
    "* original_language：原语言\n",
    "* original_title：原标题\n",
    "* overview：简介\n",
    "* popularity：流行度\n",
    "* production_companies：制作公司\n",
    "* production_countries：制作国家\n",
    "* release_date：上映日期\n",
    "* revenue：收益\n",
    "* runtime：时长\n",
    "* spoken_languages：配音语言\n",
    "* status：状态\n",
    "* tagline：一句话标语\n",
    "* title：题目\n",
    "* vote_average：平均分\n",
    "* vote_count：参与评分人数\n",
    "\n",
    "**tmdb_5000_credits.csv 中共有4 个字段，其各自释义如下：**\n",
    "* movie_id：编号\n",
    "* title：电影名称\n",
    "* cast：演员阵容\n",
    "* crew：全体人员\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "_uuid": "c87bda9d56a936be126d03eda0bc743ee35be461"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>budget</th>\n",
       "      <th>genres</th>\n",
       "      <th>homepage</th>\n",
       "      <th>id</th>\n",
       "      <th>keywords</th>\n",
       "      <th>original_language</th>\n",
       "      <th>original_title</th>\n",
       "      <th>overview</th>\n",
       "      <th>popularity</th>\n",
       "      <th>production_companies</th>\n",
       "      <th>...</th>\n",
       "      <th>runtime</th>\n",
       "      <th>spoken_languages</th>\n",
       "      <th>status</th>\n",
       "      <th>tagline</th>\n",
       "      <th>title</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>vote_count</th>\n",
       "      <th>tittle</th>\n",
       "      <th>cast</th>\n",
       "      <th>crew</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>237000000</td>\n",
       "      <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
       "      <td>http://www.avatarmovie.com/</td>\n",
       "      <td>19995</td>\n",
       "      <td>[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...</td>\n",
       "      <td>en</td>\n",
       "      <td>Avatar</td>\n",
       "      <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
       "      <td>150.437577</td>\n",
       "      <td>[{\"name\": \"Ingenious Film Partners\", \"id\": 289...</td>\n",
       "      <td>...</td>\n",
       "      <td>162.0</td>\n",
       "      <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...</td>\n",
       "      <td>Released</td>\n",
       "      <td>Enter the World of Pandora.</td>\n",
       "      <td>Avatar</td>\n",
       "      <td>7.2</td>\n",
       "      <td>11800</td>\n",
       "      <td>Avatar</td>\n",
       "      <td>[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...</td>\n",
       "      <td>[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>300000000</td>\n",
       "      <td>[{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"...</td>\n",
       "      <td>http://disney.go.com/disneypictures/pirates/</td>\n",
       "      <td>285</td>\n",
       "      <td>[{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na...</td>\n",
       "      <td>en</td>\n",
       "      <td>Pirates of the Caribbean: At World's End</td>\n",
       "      <td>Captain Barbossa, long believed to be dead, ha...</td>\n",
       "      <td>139.082615</td>\n",
       "      <td>[{\"name\": \"Walt Disney Pictures\", \"id\": 2}, {\"...</td>\n",
       "      <td>...</td>\n",
       "      <td>169.0</td>\n",
       "      <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}]</td>\n",
       "      <td>Released</td>\n",
       "      <td>At the end of the world, the adventure begins.</td>\n",
       "      <td>Pirates of the Caribbean: At World's End</td>\n",
       "      <td>6.9</td>\n",
       "      <td>4500</td>\n",
       "      <td>Pirates of the Caribbean: At World's End</td>\n",
       "      <td>[{\"cast_id\": 4, \"character\": \"Captain Jack Spa...</td>\n",
       "      <td>[{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>245000000</td>\n",
       "      <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
       "      <td>http://www.sonypictures.com/movies/spectre/</td>\n",
       "      <td>206647</td>\n",
       "      <td>[{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name...</td>\n",
       "      <td>en</td>\n",
       "      <td>Spectre</td>\n",
       "      <td>A cryptic message from Bond’s past sends him o...</td>\n",
       "      <td>107.376788</td>\n",
       "      <td>[{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam...</td>\n",
       "      <td>...</td>\n",
       "      <td>148.0</td>\n",
       "      <td>[{\"iso_639_1\": \"fr\", \"name\": \"Fran\\u00e7ais\"},...</td>\n",
       "      <td>Released</td>\n",
       "      <td>A Plan No One Escapes</td>\n",
       "      <td>Spectre</td>\n",
       "      <td>6.3</td>\n",
       "      <td>4466</td>\n",
       "      <td>Spectre</td>\n",
       "      <td>[{\"cast_id\": 1, \"character\": \"James Bond\", \"cr...</td>\n",
       "      <td>[{\"credit_id\": \"54805967c3a36829b5002c41\", \"de...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>250000000</td>\n",
       "      <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam...</td>\n",
       "      <td>http://www.thedarkknightrises.com/</td>\n",
       "      <td>49026</td>\n",
       "      <td>[{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,...</td>\n",
       "      <td>en</td>\n",
       "      <td>The Dark Knight Rises</td>\n",
       "      <td>Following the death of District Attorney Harve...</td>\n",
       "      <td>112.312950</td>\n",
       "      <td>[{\"name\": \"Legendary Pictures\", \"id\": 923}, {\"...</td>\n",
       "      <td>...</td>\n",
       "      <td>165.0</td>\n",
       "      <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}]</td>\n",
       "      <td>Released</td>\n",
       "      <td>The Legend Ends</td>\n",
       "      <td>The Dark Knight Rises</td>\n",
       "      <td>7.6</td>\n",
       "      <td>9106</td>\n",
       "      <td>The Dark Knight Rises</td>\n",
       "      <td>[{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba...</td>\n",
       "      <td>[{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>260000000</td>\n",
       "      <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
       "      <td>http://movies.disney.com/john-carter</td>\n",
       "      <td>49529</td>\n",
       "      <td>[{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":...</td>\n",
       "      <td>en</td>\n",
       "      <td>John Carter</td>\n",
       "      <td>John Carter is a war-weary, former military ca...</td>\n",
       "      <td>43.926995</td>\n",
       "      <td>[{\"name\": \"Walt Disney Pictures\", \"id\": 2}]</td>\n",
       "      <td>...</td>\n",
       "      <td>132.0</td>\n",
       "      <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}]</td>\n",
       "      <td>Released</td>\n",
       "      <td>Lost in our world, found in another.</td>\n",
       "      <td>John Carter</td>\n",
       "      <td>6.1</td>\n",
       "      <td>2124</td>\n",
       "      <td>John Carter</td>\n",
       "      <td>[{\"cast_id\": 5, \"character\": \"John Carter\", \"c...</td>\n",
       "      <td>[{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      budget                                             genres  \\\n",
       "0  237000000  [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...   \n",
       "1  300000000  [{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"...   \n",
       "2  245000000  [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...   \n",
       "3  250000000  [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam...   \n",
       "4  260000000  [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...   \n",
       "\n",
       "                                       homepage      id  \\\n",
       "0                   http://www.avatarmovie.com/   19995   \n",
       "1  http://disney.go.com/disneypictures/pirates/     285   \n",
       "2   http://www.sonypictures.com/movies/spectre/  206647   \n",
       "3            http://www.thedarkknightrises.com/   49026   \n",
       "4          http://movies.disney.com/john-carter   49529   \n",
       "\n",
       "                                            keywords original_language  \\\n",
       "0  [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...                en   \n",
       "1  [{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na...                en   \n",
       "2  [{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name...                en   \n",
       "3  [{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,...                en   \n",
       "4  [{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":...                en   \n",
       "\n",
       "                             original_title  \\\n",
       "0                                    Avatar   \n",
       "1  Pirates of the Caribbean: At World's End   \n",
       "2                                   Spectre   \n",
       "3                     The Dark Knight Rises   \n",
       "4                               John Carter   \n",
       "\n",
       "                                            overview  popularity  \\\n",
       "0  In the 22nd century, a paraplegic Marine is di...  150.437577   \n",
       "1  Captain Barbossa, long believed to be dead, ha...  139.082615   \n",
       "2  A cryptic message from Bond’s past sends him o...  107.376788   \n",
       "3  Following the death of District Attorney Harve...  112.312950   \n",
       "4  John Carter is a war-weary, former military ca...   43.926995   \n",
       "\n",
       "                                production_companies  ... runtime  \\\n",
       "0  [{\"name\": \"Ingenious Film Partners\", \"id\": 289...  ...   162.0   \n",
       "1  [{\"name\": \"Walt Disney Pictures\", \"id\": 2}, {\"...  ...   169.0   \n",
       "2  [{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam...  ...   148.0   \n",
       "3  [{\"name\": \"Legendary Pictures\", \"id\": 923}, {\"...  ...   165.0   \n",
       "4        [{\"name\": \"Walt Disney Pictures\", \"id\": 2}]  ...   132.0   \n",
       "\n",
       "                                    spoken_languages    status  \\\n",
       "0  [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...  Released   \n",
       "1           [{\"iso_639_1\": \"en\", \"name\": \"English\"}]  Released   \n",
       "2  [{\"iso_639_1\": \"fr\", \"name\": \"Fran\\u00e7ais\"},...  Released   \n",
       "3           [{\"iso_639_1\": \"en\", \"name\": \"English\"}]  Released   \n",
       "4           [{\"iso_639_1\": \"en\", \"name\": \"English\"}]  Released   \n",
       "\n",
       "                                          tagline  \\\n",
       "0                     Enter the World of Pandora.   \n",
       "1  At the end of the world, the adventure begins.   \n",
       "2                           A Plan No One Escapes   \n",
       "3                                 The Legend Ends   \n",
       "4            Lost in our world, found in another.   \n",
       "\n",
       "                                      title vote_average vote_count  \\\n",
       "0                                    Avatar          7.2      11800   \n",
       "1  Pirates of the Caribbean: At World's End          6.9       4500   \n",
       "2                                   Spectre          6.3       4466   \n",
       "3                     The Dark Knight Rises          7.6       9106   \n",
       "4                               John Carter          6.1       2124   \n",
       "\n",
       "                                     tittle  \\\n",
       "0                                    Avatar   \n",
       "1  Pirates of the Caribbean: At World's End   \n",
       "2                                   Spectre   \n",
       "3                     The Dark Knight Rises   \n",
       "4                               John Carter   \n",
       "\n",
       "                                                cast  \\\n",
       "0  [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...   \n",
       "1  [{\"cast_id\": 4, \"character\": \"Captain Jack Spa...   \n",
       "2  [{\"cast_id\": 1, \"character\": \"James Bond\", \"cr...   \n",
       "3  [{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba...   \n",
       "4  [{\"cast_id\": 5, \"character\": \"John Carter\", \"c...   \n",
       "\n",
       "                                                crew  \n",
       "0  [{\"credit_id\": \"52fe48009251416c750aca23\", \"de...  \n",
       "1  [{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de...  \n",
       "2  [{\"credit_id\": \"54805967c3a36829b5002c41\", \"de...  \n",
       "3  [{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de...  \n",
       "4  [{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de...  \n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.columns = ['id','tittle','cast','crew']\n",
    "df2= df2.merge(df1, on='id') # join 操作\n",
    "df2.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "ee603279675033fc397f0c94738e20b34f35312b"
   },
   "source": [
    "# 电影评分 - 粗排\n",
    "\n",
    "我们需要一个指标来给电影打分\n",
    "计算每部电影的分数\n",
    "对评分进行排序，并向用户推荐最佳评分的电影。\n",
    "\n",
    "> 可以使用电影的平均评分作为分数，但使用这个分数是不够公平的，因为一部平均评分为8.9且只有3票的电影不能被认为比平均评分为7.8但有40票的电影更好。\n",
    "\n",
    "where\n",
    "\n",
    "* v 是电影的票数\n",
    "* m 是需要在图表中列出的最低投票数;\n",
    "* R 电影平均分\n",
    "* C 所有电影的平均分\n",
    "\n",
    "**score** = (v/(v+m) * R) + (m/(m+v) * C)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "_uuid": "5799b99c5e5ed5b7723ae8b31e1fc9fb1e7b89ec"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.09\n",
      "1838.4\n"
     ]
    }
   ],
   "source": [
    "### 所有电影的平均分- 平均\n",
    "C= round(df2['vote_average'].mean() ,2)\n",
    "print(C)\n",
    "\n",
    "# 分位数 - [0.9] \n",
    "m= round(df2['vote_count'].quantile(0.9),2)\n",
    "print(m)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "b77dea5a38ca2c399e3abeac1487e784fe146078"
   },
   "source": [
    "#### 过滤数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "_uuid": "a22008df6d81d3b716d39a56efd3d547345bfbce"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(481, 23)"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "q_movies = df2.copy().loc[df2['vote_count'] >= m]\n",
    "q_movies.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "_uuid": "bb680ed0fb1c3020785d34152c57c6e2279d4424"
   },
   "outputs": [],
   "source": [
    "# 计算电影评分\n",
    "def weighted_rating(x, m=m, C=C):\n",
    "    v = x['vote_count']\n",
    "    R = x['vote_average']\n",
    "    # Calculation based on the IMDB formula\n",
    "    return (v/(v+m) * R) + (m/(m+v) * C)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "_uuid": "d2d189929715237ab19a18fb8747239b86092968"
   },
   "outputs": [],
   "source": [
    "# 每个电影都进行评分\n",
    "q_movies['score'] = q_movies.apply(weighted_rating, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "4cdd60c146173606146ec4fc3a1c9d8c184cb81c"
   },
   "source": [
    "**最后，让我们根据分数特性对DataFrame进行排序，并输出前10部电影的标题、投票计数、投票平均和加权评分或分数。**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "_uuid": "a9a9fc3810ea67c31908bbdf8bb930daa918102b"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>vote_count</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1881</th>\n",
       "      <td>The Shawshank Redemption</td>\n",
       "      <td>8205</td>\n",
       "      <td>8.5</td>\n",
       "      <td>8.058860</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>662</th>\n",
       "      <td>Fight Club</td>\n",
       "      <td>9413</td>\n",
       "      <td>8.3</td>\n",
       "      <td>7.938901</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>The Dark Knight</td>\n",
       "      <td>12002</td>\n",
       "      <td>8.2</td>\n",
       "      <td>7.919732</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3232</th>\n",
       "      <td>Pulp Fiction</td>\n",
       "      <td>8428</td>\n",
       "      <td>8.3</td>\n",
       "      <td>7.904256</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>Inception</td>\n",
       "      <td>13752</td>\n",
       "      <td>8.1</td>\n",
       "      <td>7.862983</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3337</th>\n",
       "      <td>The Godfather</td>\n",
       "      <td>5893</td>\n",
       "      <td>8.4</td>\n",
       "      <td>7.850720</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>Interstellar</td>\n",
       "      <td>10867</td>\n",
       "      <td>8.1</td>\n",
       "      <td>7.809164</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>809</th>\n",
       "      <td>Forrest Gump</td>\n",
       "      <td>7927</td>\n",
       "      <td>8.2</td>\n",
       "      <td>7.802779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>329</th>\n",
       "      <td>The Lord of the Rings: The Return of the King</td>\n",
       "      <td>8064</td>\n",
       "      <td>8.1</td>\n",
       "      <td>7.726840</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990</th>\n",
       "      <td>The Empire Strikes Back</td>\n",
       "      <td>5879</td>\n",
       "      <td>8.2</td>\n",
       "      <td>7.697366</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              title  vote_count  vote_average  \\\n",
       "1881                       The Shawshank Redemption        8205           8.5   \n",
       "662                                      Fight Club        9413           8.3   \n",
       "65                                  The Dark Knight       12002           8.2   \n",
       "3232                                   Pulp Fiction        8428           8.3   \n",
       "96                                        Inception       13752           8.1   \n",
       "3337                                  The Godfather        5893           8.4   \n",
       "95                                     Interstellar       10867           8.1   \n",
       "809                                    Forrest Gump        7927           8.2   \n",
       "329   The Lord of the Rings: The Return of the King        8064           8.1   \n",
       "1990                        The Empire Strikes Back        5879           8.2   \n",
       "\n",
       "         score  \n",
       "1881  8.058860  \n",
       "662   7.938901  \n",
       "65    7.919732  \n",
       "3232  7.904256  \n",
       "96    7.862983  \n",
       "3337  7.850720  \n",
       "95    7.809164  \n",
       "809   7.802779  \n",
       "329   7.726840  \n",
       "1990  7.697366  "
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 根据评分进行排序\n",
    "q_movies = q_movies.sort_values('score', ascending=False)\n",
    "\n",
    "# TOP 15 影片\n",
    "q_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "_uuid": "207f7058f92698b5fd776f7771a3ac0cc2928bf1"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>popularity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>546</th>\n",
       "      <td>Minions</td>\n",
       "      <td>875.581305</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>Interstellar</td>\n",
       "      <td>724.247784</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>788</th>\n",
       "      <td>Deadpool</td>\n",
       "      <td>514.569956</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94</th>\n",
       "      <td>Guardians of the Galaxy</td>\n",
       "      <td>481.098624</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>127</th>\n",
       "      <td>Mad Max: Fury Road</td>\n",
       "      <td>434.278564</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>Jurassic World</td>\n",
       "      <td>418.708552</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       title  popularity\n",
       "546                  Minions  875.581305\n",
       "95              Interstellar  724.247784\n",
       "788                 Deadpool  514.569956\n",
       "94   Guardians of the Galaxy  481.098624\n",
       "127       Mad Max: Fury Road  434.278564\n",
       "28            Jurassic World  418.708552"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAywAAAEWCAYAAACE8BN/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAkW0lEQVR4nO3de5xeVX3v8c+XBIVwixIKhSDxgmIMGDBYizcQe6pVRFuvpRUoR6ovqdLKOVAvCB714OXUirR4kAIiQUBQi+jxAoIiApIQSLgEbbkUuYhBQCGIAr/zx7NGHoaZzAyZ5NljPu/XK6/svfbaa6/9sF8P+c5aa0+qCkmSJEnqovUG3QFJkiRJGo2BRZIkSVJnGVgkSZIkdZaBRZIkSVJnGVgkSZIkdZaBRZIkSVJnGVgkSRomyUlJPjzofoxXkhcnuW7Q/ZCkNcHAIknqtCQ3Jrk/yb1JftbCxMaD7tdEtHv4TZJZw8qXJKkkc1an/aq6sKqetVqdlKSOMrBIkqaCvapqY2AXYAHw/gH3Z0TpGe3/rTcAb+mruyMwY610TJKmMAOLJGnKqKpbgP8HzANI8pokVye5O8kFSZ49VLeNavxjkmuS3JXkxCQbtGP7JflBf9ttpOMZw6+Z5ElJzkny89bOOUlm9x2/IMlHklwErASeNkr3vwC8tW9/X+DkYdfaLMnJ7Vo3JXl/kvWSPLHd47y+ulu0kac/SLJ7kp/2Hds6yVmtnRuSvKvv2POTLEryyzZi9U+jf+KSNHgGFknSlJFkW+DPgCVJngl8ETgY2AL4BvC1JE/oO2Uf4E+BpwPP5PGNzKwHnAhsBzwFuB84ZlidvwYOBDYBbhqlnUuATZM8O8k04M3AKcPqfAbYjF7oeSm9gLN/VT0AfJm+ERrgjcD3quqO/gbaCM/XgCuBbYA9gYOT/Gmr8mng01W1Kb3P5YyxPgBJGiQDiyRpKvhqkruBHwDfAz4KvAn4elV9p6p+C3wS2BDYre+8Y6rq5qr6BfARHv0P/nGpqjur6qyqWllVv2rtvHRYtZOq6uqqerD1ZTRDoyx/AlwL3DJ0oC/E/GNV/aqqbgT+D70wBHBqOz7kL1vZcLsCW1TVh6rqN1V1PfC5vnN/CzwjyayqureqLhnzQ5CkAZo+6A5IkjQOr62qc/sLkmxN32hGVT2c5GZ6owpDbu7bvgnYeqIXTjID+BTwCuBJrXiTJNOq6qERrrMqXwC+DzyVYdPBgFnA+jx6hOYmHrmf84EZSf4I+BkwH/jKCNfYDti6Bbwh04AL2/YBwIeA5UluAI6sqnPG2X9JWusMLJKkqepWYMehnSQBtqVv1KLtD3lKOwfgPvoWvCfZahXXeQ/wLOCPqur2JPOBJUD66tR4OlxVN7WQ8Gf0gkO/FfRGP7YDrunr8y3t3IeSnEFvlOhnwDltxGe4m4Ebqmr7UfrwE+AtberYnwNnJtm8qu4bzz1I0trmlDBJ0lR1BvCqJHsmWZ9esHgA+GFfnXcmmZ3kycD7gNNb+ZXAc5LMbwvxj1jFdTaht27l7tbOB1ez3wcALxseENpozRnAR5JskmQ74B949DqXU+lNhduHkaeDAfwI+FWSQ5NsmGRaknlJdgVI8ldJtqiqh4G72zkPr+Y9SdIaY2CRJE1JVXUd8Ff0FqqvAPai9/rj3/RVOxX4NnA98J/Ah9u5P6Y3Lepc4Cf01saM5p/prY1ZQW/h/DdXs9//WVWLRjn8d/RGf65vfToVOKHv3Evb8a3pvS1tpPYfAl5Nb8rYDa3fx9NbzA+9qW1XJ7mX3gL8N1fV/atzT5K0JqVqXKPYkiRNKUluBP778LUvkqSpxREWSZIkSZ1lYJEkSZLUWU4JkyRJktRZjrBIkiRJ6ix/D4tGNWvWrJozZ86guyFJkqTfc4sXL15RVVuMdMzAolHNmTOHRYtGe/OmJEmSNDmS3DTaMaeESZIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzvIXR2pUt698kKOWrBh0NyRJkrSGHbbzrEF3YVSOsEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysKxFSSrJKX3705P8PMk5bf81SQ4bo42tk5y5pvsqSZIkdcH0QXdgHXMfMC/JhlV1P/AnwC1DB6vqbODsVTVQVbcCr1+jvZQkSZI6whGWte8bwKva9luALw4dSLJfkmPa9klJjk7ywyTXJ3l9K5+T5Kq2vUGSE5MsS7IkyR597Xw5yTeT/CTJx1v5tNbuVe2cv1+L9y1JkiRNmCMsa99pwOFtGthOwAnAi0ep+4fAi4Ad6I28DJ8K9k6gqmrHJDsA307yzHZsPrAz8ABwXZLPAH8AbFNV8wCSzBx+wSQHAgcCzNxq9uO8RUmSJGlyOMKyllXVUmAOvdGVb4xR/atV9XBVXQNsOcLxFwGntHaXAzcBQ4HlvKq6p6p+DVwDbAdcDzwtyWeSvAL45Qj9O66qFlTVgo2etPnEb1CSJEmaRAaWwTgb+CR908FG8UDfdiZ4jf5zHwKmV9VdwHOBC4C3A8dPsE1JkiRprTKwDMYJwJFVtWw127kQ2AegTQV7CnDdaJWTzALWq6qzgPcDu6zm9SVJkqQ1yjUsA1BVPwWOnoSm/hU4Nsky4EFgv6p6IBl1MGYb4MQkQ0H1HyehD5IkSdIak6oadB/UUbPnzq+DFp476G5IkiRpDTts51kDvX6SxVW1YKRjTgmTJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmd5W+616i2mjF94L9ESJIkSes2R1gkSZIkdZaBRZIkSVJnGVgkSZIkdZaBRZIkSVJnGVgkSZIkdZZvCdOobl/5IEctWTHobkiSpCnEN4xqsjnCIkmSJKmzDCySJEmSOsvAIkmSJKmzDCySJEmSOsvAIkmSJKmzDCySJEmSOsvAIkmSJKmzDCySJEmSOsvAIkmSJKmzDCySJEmSOsvAMgFJ7h1HnYOTzJik6+2eZLdx1NsvyTFt+4gkh0zG9SVJkqRBM7BMvoOBCQWWJNNGObQ7MGZgWR1Jpq/J9iVJkqTVYWB5HNrIxwVJzkyyPMnC9LwL2Bo4P8n5re5/S3JxksuTfCnJxq38xiQfS3I58IYk70pyTZKlSU5LMgd4O/D3Sa5I8uIkWyQ5K8ll7c8Lx+jn21q9K9t5M1r5SUk+m+RS4ONr8KOSJEmSVos/XX/8dgaeA9wKXAS8sKqOTvIPwB5VtSLJLOD9wMur6r4khwL/AHyotXFnVe0CkORW4KlV9UCSmVV1d5LPAvdW1SdbnVOBT1XVD5I8BfgW8OxV9PHLVfW5du6HgQOAz7Rjs4Hdquqh/hOSHAgcCDBzq9mr8fFIkiRJq8/A8vj9qKp+CpDkCmAO8INhdV4AzAUuSgLwBODivuOn920vBRYm+Srw1VGu+XJgbmsLYNOhEZtRzGtBZSawMb2AM+RLw8MKQFUdBxwHMHvu/FpF25IkSdIaZ2B5/B7o236IkT/LAN+pqreM0sZ9fduvAl4C7AW8L8mOI9RfD3hBVf36URd5JMAMdxLw2qq6Msl+9NbEjHRtSZIkqZNcwzL5fgVs0rYvAV6Y5BkASTZK8szhJyRZD9i2qs4HDgU2ozci0t8WwLeBv+s7b/4YfdkEuC3J+sA+j+tuJEmSpAEysEy+44BvJjm/qn4O7Ad8MclSetPBdhjhnGnAKUmWAUuAo6vqbuBrwOuGFt0D7wIWtIX519BblL8qHwAupbfGZvnq35okSZK0dqXKZQoa2ey58+ughecOuhuSJGkKOWznWYPugqagJIurasFIxxxhkSRJktRZBhZJkiRJnWVgkSRJktRZBhZJkiRJnWVgkSRJktRZBhZJkiRJnWVgkSRJktRZBhZJkiRJnTV90B1Qd201Y7q//EmSJEkD5QiLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM7yLWEa1e0rH+SoJSsG3Q1J0iTy7Y+SphpHWCRJkiR1loFFkiRJUmcZWCRJkiR1loFFkiRJUmcZWCRJkiR1loFFkiRJUmcZWCRJkiR1loFFkiRJUmcZWCRJkiR1loFFkiRJUmcZWCZJkoeSXJHk6iRXJnlPkkn5fJMckeSQyWirr839khwzmW1KkiRJk236oDvwe+T+qpoPkOQPgFOBTYEPDrJTkiRJ0lTmCMsaUFV3AAcCB6VnWpJPJLksydIkfwuQZOMk5yW5PMmyJHsPtZHkfUl+nOQHwLP6yi9I8uk2mnNVkue38icn+Wpr/5IkO62qXJIkSZoKHGFZQ6rq+iTTgD8A9gbuqapdkzwRuCjJt4GbgddV1S+TzAIuSXI2sAvwZmA+vf9GlwOL+5qfUVXzk7wEOAGYBxwJLKmq1yZ5GXByO3+08hElOZBe2GLmVrMn58OQJEmSHicDy9rx34Cdkry+7W8GbA/8FPhoCx4PA9sAWwIvBr5SVSsBWojp90WAqvp+kk2TzAReBPxFK/9uks2TbLqK8hFV1XHAcQCz586v1b5zSZIkaTUYWNaQJE8DHgLuAAL8XVV9a1id/YAtgOdV1W+T3AhsMI7mhwcJg4UkSZJ+L7mGZQ1IsgXwWeCYqirgW8A7kqzfjj8zyUb0RlruaGFlD2C71sT3gdcm2TDJJsBewy7xptbOi+hNNbsHuBDYp5XvDqyoql+uolySJEnqPEdYJs+GSa4A1gceBL4A/FM7djwwB7g8SYCfA68FFgJfS7IMWAQsB6iqy5OcDlxJb4TmsmHX+nWSJe1af9PKjgBOSLIUWAnsO0a5JEmS1HnpDQBoqkhyAXBIVS1a09eaPXd+HbTw3DV9GUnSWnTYzrMG3QVJeowki6tqwUjHnBImSZIkqbOcEjbFVNXug+6DJEmStLY4wiJJkiSpswwskiRJkjrLwCJJkiSpswwskiRJkjrLwCJJkiSpswwskiRJkjrL1xprVFvNmO4vGJMkSdJAOcIiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbN8S5hGdfvKBzlqyYpBd0OS1im+nVGSHs0RFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FljBpYkWyY5Ncn1SRYnuTjJ69ZEZ5LsnuSctv2aJIetieuM0YcdklyRZEmSpw879t6+7TlJrlrNa70iyY+SLG/XPD3JU8Y454gkh6zOdSVJkqSpYpWBJUmArwLfr6qnVdXzgDcDsyfj4kmmjXasqs6uqqMm4zoT9FrgzKrauar+c9ix945Q/3FJMg/4DLBvVe1QVfOBhcCcybqGJEmSNNWNNcLyMuA3VfXZoYKquqmqPgOQZL8kxwwdS3JOkt3b9rFJFiW5OsmRfXVuTPKxJJcDb2ijDMvb/p/31ftd20n2SnJpG/U4N8mWrfyIJCckuaCNAL2rlW+U5OtJrkxyVZI3Db+xJPOTXJJkaZKvJHlSkj8DDgbekeT8YfWPAjZsIyELW/G0JJ9r9/jtJBu2uk9P8s02InVhkh1G+GwPBT5aVdf2fbZnV9X3WxtvS3JZu4ezkswY4R5GrJPk35O8tW3/bZKFrU+X9527ff++JEmS1EVjBZbnAI/3H7Xvq6oFwE7AS5Ps1Hfszqrahd7ozeeAvYDnAVuN0tYPgBdU1c7AacD/7Du2A/CnwPOBDyZZH3gFcGtVPbeq5gHfHKHNk4FDq2onYBnwwar6BvBZ4FNVtUd/5ao6DLi/quZX1T6teHvgX6rqOcDdwF+08uOAv2sjUocA/zrC9cf6bL9cVbtW1XOBa4EDJlDnQODwJC8G3tP68p/APUnmtzr7AycObzDJgS1oLrrvrjtX0T1JkiRpzZvQovsk/9J+mn/ZOKq/sf0Efwm9f5zP7Tt2evt7B+CGqvpJVRVwyihtzQa+lWQZ8D9ae0O+XlUPVNUK4A5gS3oB5E/aSM6Lq+qeYfexGTCzqr7Xij4PvGQc9zTcDVV1RdteDMxJsjGwG/ClJFcA/xf4w1U1kmTzNnLz4771KfPa6MwyYJ9h98yq6lTVz4DDgfOB91TVL1r944H921S8NwGnDm+wqo6rqgVVtWCjJ20+zo9BkiRJWjPGCixXA7sM7VTVO4E9gS1a0YPD2tgAIMlT6Y0s7NlGML4+dKy5b4L9/AxwTFXtCPztsLYe6Nt+CJheVT9u/V4GfDjJ4RO83ng95tr0Po+720jM0J9nj3Du7z7bqrqzrWE5Dti4HT8JOKjd85E8+p4ZR50dgTuBrfvKzgJeCbwaWFxVDqFIkiSp08YKLN8FNkjyjr6y/rUUNwLzk6yXZFt607IANqUXSu5p601eOUr7y+mNSgy9jesto9TbDLilbe87Rp9JsjWwsqpOAT5BX+gCaCMud7UpUwB/DXyPsf22TTkbVVX9ErghyRtaX5LkuSNU/TjwviT9Yab/s90EuK1dbx9GNmKdJM+n95nvDBzSAiRV9WvgW8CxjDAdTJIkSeqaVQaWNk3rtfTWoNyQ5Ef0pk8d2qpcBNwAXAMcTVuTUVVX0psKtpzetKOLRmn/1/TWW3y9TR+7Y5SuHEFvitViYMU47mtH4EdtStYHgQ+PUGdf4BNJlgLzgQ+No93jgKV9i+5Hsw9wQJIr6Y2k7D28QlUtA94NnJzkuiQXAc/mkWlaHwAupffZLR/lOo+pk+SJ9NYF/U1V3UpvDcsJSdLOWQg8DHx77NuVJEmSBiu9TKJ1RVsjs1lVfWCsurPnzq+DFp67FnolSRpy2M6zBt0FSVrrkixuL+x6jOlruzManCRfAZ5O73XVkiRJUucZWNYhVfW6QfdBkiRJmogJvdZYkiRJktYmA4skSZKkzjKwSJIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzvK1xhrVVjOm+wvMJEmSNFCOsEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM4ysEiSJEnqLAOLJEmSpM7ytcYa1e0rH+SoJSsG3Q1JWit8jbskdZMjLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6ayCBJUklOaVvf3qSnyc5Z4LtXJBkwSjl/5UkfWVfTXLv6vV81D5cl+SK9uf1a+AaD7W2r0rytSQzJ6ndSf88JEmSpMk0qBGW+4B5STZs+38C3DLJ17gbeCFA+wf+H05y+/32qar57c+Z4zkhyfQJtH9/a3se8AvgnY+rl5IkSdIUM8gpYd8AXtW23wJ8cehAkucnuTjJkiQ/TPKsVr5hktOSXJvkK8CGj232d04D3ty2/xz4cl/7Gyc5L8nlSZYl2buV75pkaZINkmyU5Ook8yZ6Y0lO6h9pGRrJSLJ7kguTnA1ck+RDSQ7uq/eRJO8eo/mLgW1a/flJLml9/kqSJ7XytyW5LMmVSc5KMqOVP7V9rsuSfHii9yVJkiStbYMMLKcBb06yAbATcGnfseXAi6tqZ+Bw4KOt/B3Ayqp6NvBB4HmraP884CVJptELLqf3Hfs18Lqq2gXYA/g/SVJVlwFnAx8GPg6cUlVXASS5YhXXWtg3JWzzMe57F+DdVfVM4ATgra399Vo/TxntxHYve7Y+ApwMHFpVOwHL6H0mAF+uql2r6rnAtcABrfzTwLFVtSNw2yjXODDJoiSL7rvrzjFuRZIkSVqzJjItaVJV1dIkc+iNrnxj2OHNgM8n2R4oYP1W/hLg6L7zl67iEg8BP6AXAjasqhv7l7QAH03yEuBheiMWWwK3Ax8CLqMXat7V19/5q7jWPlW16HeNP3Kdkfyoqm5obd6Y5M4kO7frL6mqkVLChi0wbUMvgHwnyWbAzKr6XqvzeeBLbXteG0GZCWwMfKuVvxD4i7b9BeBjwy9UVccBxwHMnju/VnUjkiRJ0po26LeEnQ18kr7pYM3/As5vazb2AjZ4nO2fRi/gnDGsfB9gC+B5LYj8rO8am9P7R/4mq3HdB2mfbRs5eULfsfuG1T0e2A/Yn96Iy0jub/3cjl7YGmsNy0nAQW0k5UgefR+GEEmSJE0Zgw4sJwBHVtWyYeWb8cgi/P36yr8P/CVAW1uy0xjtXwj8bx4biDYD7qiq3ybZg14QGPJ/gQ8ACxlhBGKcbuSR6Wqv4ZERopF8BXgFsCuPjISMqKpW0hv1eQ+94HNXkhe3w38NDI22bALclmR9euFsyEU8sq6nv1ySJEnqpIEGlqr6aVUdPcKhjwP/O8kSHj1t7Vhg4yTX0pu6tXiM9quqPllVK4YdWggsSLKM3hqS5QBJ3gr8tqpOBY4Cdk3ysnbsignc2ueAlya5EvhjHjuq0t/H3wDnA2dU1UNjNVxVS4Cl9KbS7Qt8ok2Nm0/vM4Fe4LqUXkBZ3nf6u4F3tvveZgL3I0mSJA1EqpwhNEhtytjlwBuq6ieD7k+/2XPn10ELzx10NyRprThs51mD7oIkrbOSLK6qx/x+RRj8lLB1WpK5wH8A53UtrEiSJEldMLC3hAmq6hrgaYPuhyRJktRVjrBIkiRJ6iwDiyRJkqTOMrBIkiRJ6iwDiyRJkqTOMrBIkiRJ6iwDiyRJkqTO8rXGGtVWM6b7i9QkSZI0UI6wSJIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzjKwSJIkSeosA4skSZKkzvK1xhrV7Ssf5KglKwbdDUl6XHwtuyT9fnCERZIkSVJnGVgkSZIkdZaBRZIkSVJnGVgkSZIkdZaBRZIkSVJnGVgkSZIkdZaBRZIkSVJnGVgkSZIkdZaBRZIkSVJnrXOBJcm9A7jmD8dZb2aSO5Ok7f9xkkoyu+1vluQXScb13y3JnCRXjXLsgiQLxnsPkiRJ0iCsc4FlIpJMm4x2qmq3cda7G7gNeHYr2g1Y0v4GeAHwo6p6eKy2kkyfeE8lSZKkblknA0uS3ZOc07d/TJL92vaNST6W5HLgDUneluSyJFcmOSvJjFbvDUmuauXfb2XPSfKjJFckWZpk+1Z+b9+1Dk2yrJ131Ajd+yGPBJTdgE8N278oyQZJTmztLEmyR2t7vyRnJ/kucN6we94wyWlJrk3yFWDD1fwYJUmSpDXOn8KP7M6q2gUgyeZV9bm2/WHgAOAzwOHAn1bVLUlmtvPeDny6qhYmeQLwqBGaJK8E9gb+qKpWJnnyCNe+CHgpcDzwNOBLwN+2Y7sBRwHvBKqqdkyyA/DtJM9sdXYBdqqqXySZ09fuO4CVVfXsJDsBl49040kOBA4EmLnV7LE+J0mSJGmNWidHWMbh9L7teUkuTLIM2Ad4Tiu/CDgpydt4JJhcDLw3yaHAdlV1/7B2Xw6cWFUrAarqFyNc+4fAbkmeCtxYVb8GkmRj4HnApcCLgFNaG8uBm4ChwPKdUdp9Sd85S4GlI914VR1XVQuqasFGT9p8pCqSJEnSWrOuBpYHefS9bzDs+H192ycBB1XVjsCRQ3Wr6u3A+4FtgcVtJOZU4DXA/cA3krxsoh2rqp8AM4G96AUggMXA/vQCzFgvDbhvjOOSJEnSlLGuBpabgLlJntimc+25irqbALclWZ/eCAsASZ5eVZdW1eHAz4FtkzwNuL6qjgb+HdhpWFvfAfbvWwcz0pQwgEuAd/NIYLkYOJjeqA7AhUN9aVPBngJcN8Y9fx/4y3bOvBH6JkmSJHXOOhVY2puzHqiqm4EzgKva30tWcdoH6E3DughY3lf+ibbo/Sp607iuBN4IXJXkCmAecHJ/Q1X1TeBsYFGrc8go17yI3sjNorZ/Mb31LEOvR/5XYL02Te10YL+qemCVNw/HAhsnuRb4EL1RG0mSJKnTUlWD7sNak+S5wOeq6vmD7stUMHvu/Dpo4bmD7oYkPS6H7Txr0F2QJI1TksVVNeLvCFxnRliSvB34Ir11J5IkSZKmgHXmtcZV9Vngs4PuhyRJkqTxW2dGWCRJkiRNPQYWSZIkSZ1lYJEkSZLUWQYWSZIkSZ1lYJEkSZLUWQYWSZIkSZ21zrzWWBO31Yzp/uI1SZIkDZQjLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6y8AiSZIkqbMMLJIkSZI6K1U16D6oo5L8Crhu0P3QlDcLWDHoTmjK8znS6vIZ0mTwOVpztquqLUY6MH1t90RTynVVtWDQndDUlmSRz5FWl8+RVpfPkCaDz9FgOCVMkiRJUmcZWCRJkiR1loFFq3LcoDug3ws+R5oMPkdaXT5Dmgw+RwPgontJkiRJneUIiyRJkqTOMrBIkiRJ6iwDi0aU5BVJrkvyH0kOG3R/1E1Jtk1yfpJrklyd5N2t/MlJvpPkJ+3vJ7XyJDm6PVdLk+wy2DtQlySZlmRJknPa/lOTXNqel9OTPKGVP7Ht/0c7PmegHVdnJJmZ5Mwky5Ncm+SP/T7SRCT5+/b/s6uSfDHJBn4XDZ6BRY+RZBrwL8ArgbnAW5LMHWyv1FEPAu+pqrnAC4B3tmflMOC8qtoeOK/tQ++Z2r79ORA4du13WR32buDavv2PAZ+qqmcAdwEHtPIDgLta+adaPQng08A3q2oH4Ln0nie/jzQuSbYB3gUsqKp5wDTgzfhdNHAGFo3k+cB/VNX1VfUb4DRg7wH3SR1UVbdV1eVt+1f0/nGwDb3n5fOt2ueB17btvYGTq+cSYGaSP1y7vVYXJZkNvAo4vu0HeBlwZqsy/Dkaer7OBPZs9bUOS7IZ8BLg3wCq6jdVdTd+H2lipgMbJpkOzABuw++igTOwaCTbADf37f+0lUmjakPhOwOXAltW1W3t0O3Alm3bZ0uj+WfgfwIPt/3Ngbur6sG23/+s/O45asfvafW1bnsq8HPgxDa18PgkG+H3kcapqm4BPgn8F72gcg+wGL+LBs7AImm1JdkYOAs4uKp+2X+seu9O9/3pGlWSVwN3VNXiQfdFU9p0YBfg2KraGbiPR6Z/AX4fadXa+qa96YXfrYGNgFcMtFMCDCwa2S3Atn37s1uZ9BhJ1qcXVhZW1Zdb8c+Gpla0v+9o5T5bGskLgdckuZHeFNSX0VuLMLNNy4BHPyu/e47a8c2AO9dmh9VJPwV+WlWXtv0z6QUYv480Xi8Hbqiqn1fVb4Ev0/t+8rtowAwsGsllwPbtrRhPoLfg7OwB90kd1Obq/htwbVX9U9+hs4F92/a+wL/3lb+1vZ3nBcA9fVM1tI6qqn+sqtlVNYfe9813q2of4Hzg9a3a8Odo6Pl6favvT83XcVV1O3Bzkme1oj2Ba/D7SOP3X8ALksxo/38beob8Lhowf9O9RpTkz+jNKZ8GnFBVHxlsj9RFSV4EXAgs45G1B++lt47lDOApwE3AG6vqF+1/AMfQG2JfCexfVYvWesfVWUl2Bw6pqlcneRq9EZcnA0uAv6qqB5JsAHyB3pqpXwBvrqrrB9RldUiS+fRe3PAE4Hpgf3o/nPX7SOOS5EjgTfTegrkE+O/01qr4XTRABhZJkiRJneWUMEmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJEmS1FkGFkmSJEmdZWCRJK1TkjyU5IokVyX5UpIZk9z+BUkWTPCcDyV5eds+eLL7JElTmYFFkrSuub+q5lfVPOA3wNsH2Zkk06rq8Ko6txUdDBhYJKkxsEiS1mUXAs9I8uQkX02yNMklSXYCSHJEki8kuTjJT5K8rZXvnuScoUaSHJNkv+GNJzk2yaIkV7dfSDdUfmOSjyW5HHhDkpOSvD7Ju4CtgfOTnJ/kb5L8c995b0vyqTX1YUhSFxlYJEnrpCTTgVcCy4AjgSVVtRPwXuDkvqo7AS8D/hg4PMnWE7jM+6pqQWvjpUNBqLmzqnapqtOGCqrqaOBWYI+q2oPeb2jfK8n6rcr+wAkTuU9JmuoMLJKkdc2GSa4AFgH/Bfwb8CLgCwBV9V1g8ySbtvr/XlX3V9UK4Hzg+RO41hvbKMoS4DnA3L5jp491clXdC3wXeHWSHYD1q2rZBK4vSVPe9EF3QJKktez+qprfX5BkVfVrhP0HefQP/TYYflKSpwKHALtW1V1JThpW775x9vd4eqM+y4ETx3mOJP3ecIRFkqTeWpZ9oLc+BVhRVb9sx/ZOskGSzYHdgcuAm4C5SZ6YZCaw5whtbkovlNyTZEt608/G41fAJkM7VXUpsC3wl8AXJ3RXkvR7wBEWSZLgCOCEJEuBlcC+fceW0psKNgv4X1V1K0CSM4CrgBvoTfl6lKq6MskSeiMjNwMXjbMvxwHfTHJrW8cCvbUs86vqronemCRNdakaPtItSZKg95Yw4N6q+uSA+3EO8KmqOm+Q/ZCkQXBKmCRJHZVkZpIf01t3Y1iRtE5yhEWSJElSZznCIkmSJKmzDCySJEmSOsvAIkmSJKmzDCySJEmSOsvAIkmSJKmz/j9NmRg8doJDIwAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 864x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 流行度排行榜\n",
    "pop= df2.sort_values('popularity', ascending=False)\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "plt.figure(figsize=(12,4))\n",
    "\n",
    "plt.barh(pop['title'].head(6),pop['popularity'].head(6), align='center',\n",
    "        color='skyblue')\n",
    "\n",
    "plt.gca().invert_yaxis()\n",
    "plt.xlabel(\"Popularity\")\n",
    "plt.title(\"Popular Movies\")\n",
    "\n",
    "# top - 6\n",
    "pop[['title','popularity']].head(6)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "fe716df6e5e5a354ac53d556087147c0a64df2cc"
   },
   "source": [
    "# **基于内容的过滤**\n",
    "\n",
    "在这个推荐系统中，电影的内容(概述、演员、工作人员、关键字、口号等)被用来寻找与其他电影的相似之处。然后推荐最有可能相似的电影。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "b0a813c803b0ba1f0204188ab2a63dc7f59ce2eb"
   },
   "source": [
    "## **基于情节描述的推荐**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "_uuid": "5e676c38ace04a24205b76b16dac0fa3e058027f"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    In the 22nd century, a paraplegic Marine is di...\n",
       "1    Captain Barbossa, long believed to be dead, ha...\n",
       "2    A cryptic message from Bond’s past sends him o...\n",
       "3    Following the death of District Attorney Harve...\n",
       "4    John Carter is a war-weary, former military ca...\n",
       "Name: overview, dtype: object"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2['overview'].head(5) # 电影描述"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "_uuid": "a92da8cde39c61deef5a1b8efa31ed84cda7f5fe"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4803, 20978)"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# TF_IDF 向量\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# 删除所有英文停止字，如'the'， 'a'\n",
    "tfidf = TfidfVectorizer(stop_words='english')\n",
    "\n",
    "#  用空字符串替换NaN\n",
    "df2['overview'] = df2['overview'].fillna('')\n",
    "\n",
    "# 构建稀疏矩阵\n",
    "tfidf_matrix = tfidf.fit_transform(df2['overview'])\n",
    "\n",
    "#Output the shape of tfidf_matrix\n",
    "tfidf_matrix.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "6bde57434bf9a0e8f8b229d36901d75b77ff962f"
   },
   "source": [
    "![](https://tse4-mm.cn.bing.net/th/id/OIP-C.YcWzdyViv7PWlgrqa5pV8wAAAA?pid=ImgDet&rs=1)\n",
    "\n",
    "**余弦相似度计算公式:**\n",
    "<img src=\"https://tse1-mm.cn.bing.net/th/id/R-C.927ecc4e209d20051cd76d56d9d87af8?rik=jAE%2fe1T05k0tpw&riu=http%3a%2f%2fwww.au92.com%2fimages%2f2020%2f01%2fcos.png&ehk=bZEp0rhRQxkXCSALWHyyXQiFmKo0JSS8D3uXTs9jO1A%3d&risl=&pid=ImgRaw&r=0&sres=1&sresct=1\" width=\"500\" height=\"400\"/> "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "_uuid": "5eb17d12220eecab4faf01bbfd13e79d8e446537"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 0., 0., ..., 0., 0., 0.]])"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics.pairwise import linear_kernel\n",
    "\n",
    "# 计算矩阵的相似度\n",
    "cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)\n",
    "cosine_sim[:1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> 每行包含其与所有行的相似度"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "f5ca112fbbe25b11f0f3356a31d1604727242700"
   },
   "source": [
    "**我们将定义一个函数，该函数以电影标题作为输入并输出10个最相似电影的列表。**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "_uuid": "55df2df36be98e6dec5f617a5aa51b77c500faa4"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "title\n",
       "Avatar                                      0\n",
       "Pirates of the Caribbean: At World's End    1\n",
       "Spectre                                     2\n",
       "The Dark Knight Rises                       3\n",
       "John Carter                                 4\n",
       "dtype: int64"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 获取电影所在的行号\n",
    "indices = pd.Series(df2.index, index=df2['title']).drop_duplicates()\n",
    "\n",
    "indices.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "da5896c6ccfd44c3347af3097275d0aa707c1001"
   },
   "source": [
    "现在，我们可以很好地定义推荐功能。遵循以下步骤:\n",
    "\n",
    "+ 1. 根据标题获得电影的索引。\n",
    "+ 2. 获取该特定电影与所有电影的余弦相似度得分列表。 将其转换为元组列表，其中第一个元素是其位置，第二个元素是相似性分数。\n",
    "+ 3. 根据相似度分数对上述元组列表进行排序。\n",
    "+ 4. 获取此列表的前10个元素。忽略第一个元素（与特定电影最相似的电影是电影本身）。\n",
    "+ 5. 返回与顶部元素索引相对应的电影标题。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "_uuid": "9c383fcbb916dce464b01adf980d26ad96aebe0e"
   },
   "outputs": [],
   "source": [
    "# 获取推荐结果\n",
    "def get_recommendations(title, cosine_sim=cosine_sim):\n",
    "    # 获取标题的索引\n",
    "    idx = indices[title]\n",
    "\n",
    "    # 获取相似度序号及相似度值\n",
    "    sim_scores = list(enumerate(cosine_sim[idx]))\n",
    "\n",
    "    #  排序 = 降序排\n",
    "    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
    "\n",
    "    # 获取前10 相似度\n",
    "    sim_scores = sim_scores[1:11]\n",
    "    movie_indices = [i[0] for i in sim_scores]\n",
    "\n",
    "    return df2['title'].iloc[movie_indices] # 返回title"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "seasons = ['Spring', 'Summer', 'Fall', 'Winter']\n",
    "list(enumerate(seasons))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "_uuid": "14d722124f82e69cb444adcc589e396c75cbb4ff"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65                              The Dark Knight\n",
       "299                              Batman Forever\n",
       "428                              Batman Returns\n",
       "1359                                     Batman\n",
       "3854    Batman: The Dark Knight Returns, Part 2\n",
       "119                               Batman Begins\n",
       "2507                                  Slow Burn\n",
       "9            Batman v Superman: Dawn of Justice\n",
       "1181                                        JFK\n",
       "210                              Batman & Robin\n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_recommendations('The Dark Knight Rises')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "_uuid": "902b9f1ab91921889c85e9008818dcc0b4710ccd"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7               Avengers: Age of Ultron\n",
       "3144                            Plastic\n",
       "1715                            Timecop\n",
       "4124                 This Thing of Ours\n",
       "3311              Thank You for Smoking\n",
       "3033                      The Corruptor\n",
       "588     Wall Street: Money Never Sleeps\n",
       "2136         Team America: World Police\n",
       "1468                       The Fountain\n",
       "1286                        Snowpiercer\n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_recommendations('The Avengers')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "fcfe9db9c2fdd9334538256d233c6acf33c1c049"
   },
   "source": [
    "# 基于荣誉，电影类型和关键字的推荐\n",
    "显然，使用更好的元数据将提高我们推荐程序的质量。这正是我们在本节中要做的。我们将基于以下元数据构建推荐系统：3个顶级演员，导演，相关流派和电影情节关键字。从演员，剧组和关键字中，我们需要提取三个最重要的演员，导演和与该电影相关的关键字。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "_uuid": "59a8d0991e3cae9a44a4b351e154fd1000724448"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...\n",
       "1    [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...\n",
       "2    [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...\n",
       "3    [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...\n",
       "4    [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...\n",
       "Name: genres, dtype: object"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将字符串化特性解析为它们对应的python对象\n",
    "from ast import literal_eval\n",
    "\n",
    "features = ['cast', 'crew', 'keywords', 'genres']\n",
    "for feature in features:\n",
    "    df2[feature] = df2[feature].apply(literal_eval)\n",
    "\n",
    "df2[feature].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "_uuid": "783b0e89f1c04a12ff51eb29cc68e93c818896cd"
   },
   "outputs": [],
   "source": [
    "# 返回导演名称, 空 =  NaN\n",
    "def get_director(x):\n",
    "    for i in x:\n",
    "        if i['job'] == 'Director':\n",
    "            return i['name']\n",
    "    return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "_uuid": "86c4e9f4e6ef1e5ff287f58f3a1119fbddbdae09"
   },
   "outputs": [],
   "source": [
    "# 返回列表前3 个元素; whichever is more.\n",
    "def get_list(x):\n",
    "    if isinstance(x, list):\n",
    "        names = [i['name'] for i in x]\n",
    "        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
    "        if len(names) > 3:\n",
    "            names = names[:3]\n",
    "        return names\n",
    "\n",
    "    return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "_uuid": "dd060c3c1d724de71555218f30cccafd4a8ad6af"
   },
   "outputs": [],
   "source": [
    "# 以合适的形式定义新的导演、演员阵容、类型和关键字。\n",
    "df2['director'] = df2['crew'].apply(get_director)\n",
    "\n",
    "features = ['cast', 'keywords', 'genres']\n",
    "for feature in features:\n",
    "    df2[feature] = df2[feature].apply(get_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "_uuid": "87a96f835470aa3df590b74322c2717ff529d6ae"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>cast</th>\n",
       "      <th>director</th>\n",
       "      <th>keywords</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Avatar</td>\n",
       "      <td>[Sam Worthington, Zoe Saldana, Sigourney Weaver]</td>\n",
       "      <td>James Cameron</td>\n",
       "      <td>[culture clash, future, space war]</td>\n",
       "      <td>[Action, Adventure, Fantasy]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Pirates of the Caribbean: At World's End</td>\n",
       "      <td>[Johnny Depp, Orlando Bloom, Keira Knightley]</td>\n",
       "      <td>Gore Verbinski</td>\n",
       "      <td>[ocean, drug abuse, exotic island]</td>\n",
       "      <td>[Adventure, Fantasy, Action]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Spectre</td>\n",
       "      <td>[Daniel Craig, Christoph Waltz, Léa Seydoux]</td>\n",
       "      <td>Sam Mendes</td>\n",
       "      <td>[spy, based on novel, secret agent]</td>\n",
       "      <td>[Action, Adventure, Crime]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      title  \\\n",
       "0                                    Avatar   \n",
       "1  Pirates of the Caribbean: At World's End   \n",
       "2                                   Spectre   \n",
       "\n",
       "                                               cast        director  \\\n",
       "0  [Sam Worthington, Zoe Saldana, Sigourney Weaver]   James Cameron   \n",
       "1     [Johnny Depp, Orlando Bloom, Keira Knightley]  Gore Verbinski   \n",
       "2      [Daniel Craig, Christoph Waltz, Léa Seydoux]      Sam Mendes   \n",
       "\n",
       "                              keywords                        genres  \n",
       "0   [culture clash, future, space war]  [Action, Adventure, Fantasy]  \n",
       "1   [ocean, drug abuse, exotic island]  [Adventure, Fantasy, Action]  \n",
       "2  [spy, based on novel, secret agent]    [Action, Adventure, Crime]  "
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Print the new features of the first 3 films\n",
    "df2[['title', 'cast', 'director', 'keywords', 'genres']].head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "0bcb2c2e99ffd9ce73205c9c6ef6687d16caa31f"
   },
   "source": [
    "下一步是将名称和关键字实例转换为小写并去除它们之间的所有空格。 这样做是为了使矢量化程序不会将“ Johnny Depp”和“ Johnny Galecki”中的Johnny视为相同变量。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "_uuid": "86af764c406a8b6184b37b57cfe499d20ce45f9c"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "johnnydepp\n"
     ]
    }
   ],
   "source": [
    "# 转小写去空格\n",
    "def clean_data(x):\n",
    "    if isinstance(x, list):\n",
    "        return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
    "    else:\n",
    "        #Check if director exists. If not, return empty string\n",
    "        if isinstance(x, str):\n",
    "            return str.lower(x.replace(\" \", \"\"))\n",
    "        else:\n",
    "            return ''\n",
    "\n",
    "print(clean_data('Johnny Depp'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "_uuid": "5728cc017ff6ed1dcd79da05b1dd57a60557e853"
   },
   "outputs": [],
   "source": [
    "# 清理数据\n",
    "features = ['cast', 'keywords', 'director', 'genres']\n",
    "\n",
    "for feature in features:\n",
    "    df2[feature] = df2[feature].apply(clean_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cast</th>\n",
       "      <th>keywords</th>\n",
       "      <th>director</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[samworthington, zoesaldana, sigourneyweaver]</td>\n",
       "      <td>[cultureclash, future, spacewar]</td>\n",
       "      <td>jamescameron</td>\n",
       "      <td>[action, adventure, fantasy]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[johnnydepp, orlandobloom, keiraknightley]</td>\n",
       "      <td>[ocean, drugabuse, exoticisland]</td>\n",
       "      <td>goreverbinski</td>\n",
       "      <td>[adventure, fantasy, action]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[danielcraig, christophwaltz, léaseydoux]</td>\n",
       "      <td>[spy, basedonnovel, secretagent]</td>\n",
       "      <td>sammendes</td>\n",
       "      <td>[action, adventure, crime]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[christianbale, michaelcaine, garyoldman]</td>\n",
       "      <td>[dccomics, crimefighter, terrorist]</td>\n",
       "      <td>christophernolan</td>\n",
       "      <td>[action, crime, drama]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[taylorkitsch, lynncollins, samanthamorton]</td>\n",
       "      <td>[basedonnovel, mars, medallion]</td>\n",
       "      <td>andrewstanton</td>\n",
       "      <td>[action, adventure, sciencefiction]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            cast  \\\n",
       "0  [samworthington, zoesaldana, sigourneyweaver]   \n",
       "1     [johnnydepp, orlandobloom, keiraknightley]   \n",
       "2      [danielcraig, christophwaltz, léaseydoux]   \n",
       "3      [christianbale, michaelcaine, garyoldman]   \n",
       "4    [taylorkitsch, lynncollins, samanthamorton]   \n",
       "\n",
       "                              keywords          director  \\\n",
       "0     [cultureclash, future, spacewar]      jamescameron   \n",
       "1     [ocean, drugabuse, exoticisland]     goreverbinski   \n",
       "2     [spy, basedonnovel, secretagent]         sammendes   \n",
       "3  [dccomics, crimefighter, terrorist]  christophernolan   \n",
       "4      [basedonnovel, mars, medallion]     andrewstanton   \n",
       "\n",
       "                                genres  \n",
       "0         [action, adventure, fantasy]  \n",
       "1         [adventure, fantasy, action]  \n",
       "2           [action, adventure, crime]  \n",
       "3               [action, crime, drama]  \n",
       "4  [action, adventure, sciencefiction]  "
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2[features].head() # 查看效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "_uuid": "20aef87703c408926f7617573ed043605207767f"
   },
   "outputs": [],
   "source": [
    "def create_soup(x):\n",
    "    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])\n",
    "df2['soup'] = df2.apply(create_soup, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "7b79886883806b8fb58098f9f803dabeaa0cadf6"
   },
   "source": [
    "后续步骤与我们对基于内容的推荐算法所做的相同。 一个重要的区别是我们使用CountVectorizer 而不是TF-IDF。 这是因为我们不希望减轻演员/导演在相对较多的电影中所扮演或导演的影响力。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "_uuid": "b66a1afc1083917d5ef136ccdcd9b50cca087e2b"
   },
   "outputs": [],
   "source": [
    "# 导入CountVectorizer并创建计数矩阵\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "count = CountVectorizer(stop_words='english')\n",
    "count_matrix = count.fit_transform(df2['soup'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "_uuid": "3fa5539ed1680ed5323f8351ac7e4840f629e958"
   },
   "outputs": [],
   "source": [
    "# 基于count_matrix计算余弦相似度矩阵\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "cosine_sim2 = cosine_similarity(count_matrix, count_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "_uuid": "b2b8565a04f4bda92d3ba9d15c348af1cd8f8b4d"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "title\n",
       "Avatar                                      0\n",
       "Pirates of the Caribbean: At World's End    1\n",
       "Spectre                                     2\n",
       "The Dark Knight Rises                       3\n",
       "John Carter                                 4\n",
       "dtype: int64"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 重置主DataFrame的索引，并像前面一样构造反向映射\n",
    "df2 = df2.reset_index()\n",
    "indices = pd.Series(df2.index, index=df2['title'])\n",
    "\n",
    "indices.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "_uuid": "d1e0e02be7a9e71422d3a492834cb4f8434d1464"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65               The Dark Knight\n",
       "119                Batman Begins\n",
       "4638    Amidst the Devil's Wings\n",
       "1196                The Prestige\n",
       "3073           Romeo Is Bleeding\n",
       "3326              Black November\n",
       "1503                      Takers\n",
       "1986                      Faster\n",
       "303                     Catwoman\n",
       "747               Gangster Squad\n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_recommendations('The Dark Knight Rises', cosine_sim2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "_uuid": "d6c4df85a80d830b2905f69e0e59ebb3461db3b7"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "867      The Godfather: Part III\n",
       "2731      The Godfather: Part II\n",
       "4638    Amidst the Devil's Wings\n",
       "2649           The Son of No One\n",
       "1525              Apocalypse Now\n",
       "1018             The Cotton Club\n",
       "1170     The Talented Mr. Ripley\n",
       "1209               The Rainmaker\n",
       "1394               Donnie Brasco\n",
       "1850                    Scarface\n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_recommendations('The Godfather', cosine_sim2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "71b15b5c090694303fa5e8d67b8bf394e07f45d6"
   },
   "source": [
    "# **协同过滤**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "4307f75107f9c5e5f911d52a6f1dc5530990c75e"
   },
   "source": [
    "### **奇异值分解**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "_uuid": "9a7faf48bf42293d18b29efac95e15010f6c900e"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>31</td>\n",
       "      <td>2.5</td>\n",
       "      <td>1260759144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1029</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1260759179</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1061</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1260759182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1129</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1260759185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1172</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1260759205</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId  rating   timestamp\n",
       "0       1       31     2.5  1260759144\n",
       "1       1     1029     3.0  1260759179\n",
       "2       1     1061     3.0  1260759182\n",
       "3       1     1129     2.0  1260759185\n",
       "4       1     1172     4.0  1260759205"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from surprise import Reader, Dataset, SVD\n",
    "from surprise.model_selection import cross_validate\n",
    "\n",
    "reader = Reader()\n",
    "ratings = pd.read_csv('./movie/ratings_small.csv')\n",
    "ratings.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "f8a5cd5580510c27846a564bfa6d13f1a6dfa6de"
   },
   "source": [
    "请注意，与之前的数据集不同，在这个数据集中，电影是按照5个等级进行评级的。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "_uuid": "75166cecd9821bab4299605b66ea2a7787a4c3b7"
   },
   "outputs": [],
   "source": [
    "data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "_uuid": "17880aee6e7750afed98002593251010dcf5fb20"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'test_rmse': array([0.89845754, 0.87851544, 0.90687626, 0.88875854, 0.90646164]),\n",
       " 'test_mae': array([0.69207854, 0.67794004, 0.69724481, 0.68496575, 0.69589722]),\n",
       " 'fit_time': (4.268375635147095,\n",
       "  4.251009225845337,\n",
       "  4.2722389698028564,\n",
       "  3.6795027256011963,\n",
       "  3.63496994972229),\n",
       " 'test_time': (0.12253570556640625,\n",
       "  0.12330198287963867,\n",
       "  0.12207889556884766,\n",
       "  0.09984803199768066,\n",
       "  0.10153317451477051)}"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svd = SVD()\n",
    "cross_validate(svd, data, measures=['RMSE', 'MAE'],cv = 5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_uuid": "81f6d5f460d4cbaaa43b6bb86a0abd9bce1a3134"
   },
   "source": [
    "我们得到的平均均方根误差约为0.89，这对我们的情况来说已经足够好了。现在让我们在我们的数据集上进行训练，并得出预测结果。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "_uuid": "58007ee500ce1735d173c247d188a5cd603b803c"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<surprise.prediction_algorithms.matrix_factorization.SVD at 0x14c51df10>"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainset = data.build_full_trainset()\n",
    "svd.fit(trainset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "_uuid": "5a526fddacb2f7234e524e71224fdd1aecdd6ec0"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>4.0</td>\n",
       "      <td>835355493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>2</td>\n",
       "      <td>17</td>\n",
       "      <td>5.0</td>\n",
       "      <td>835355681</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>2</td>\n",
       "      <td>39</td>\n",
       "      <td>5.0</td>\n",
       "      <td>835355604</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>2</td>\n",
       "      <td>47</td>\n",
       "      <td>4.0</td>\n",
       "      <td>835355552</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>2</td>\n",
       "      <td>50</td>\n",
       "      <td>4.0</td>\n",
       "      <td>835355586</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91</th>\n",
       "      <td>2</td>\n",
       "      <td>592</td>\n",
       "      <td>5.0</td>\n",
       "      <td>835355395</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>2</td>\n",
       "      <td>593</td>\n",
       "      <td>3.0</td>\n",
       "      <td>835355511</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>93</th>\n",
       "      <td>2</td>\n",
       "      <td>616</td>\n",
       "      <td>3.0</td>\n",
       "      <td>835355932</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94</th>\n",
       "      <td>2</td>\n",
       "      <td>661</td>\n",
       "      <td>4.0</td>\n",
       "      <td>835356141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>2</td>\n",
       "      <td>720</td>\n",
       "      <td>4.0</td>\n",
       "      <td>835355978</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>76 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    userId  movieId  rating  timestamp\n",
       "20       2       10     4.0  835355493\n",
       "21       2       17     5.0  835355681\n",
       "22       2       39     5.0  835355604\n",
       "23       2       47     4.0  835355552\n",
       "24       2       50     4.0  835355586\n",
       "..     ...      ...     ...        ...\n",
       "91       2      592     5.0  835355395\n",
       "92       2      593     3.0  835355511\n",
       "93       2      616     3.0  835355932\n",
       "94       2      661     4.0  835356141\n",
       "95       2      720     4.0  835355978\n",
       "\n",
       "[76 rows x 4 columns]"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings[ratings['userId'] == 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "_uuid": "0cb981abe36b30f8f27b4c5dc1b1d0e090431651"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Prediction(uid=1, iid=302, r_ui=3, est=2.7593729618356786, details={'was_impossible': False})"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svd.predict(1, 302, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}